BiocManager::install("rGREAT")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.3 (2025-02-28)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'rGREAT'
## Old packages: 'data.table', 'generics', 'softImpute', 'zip'
suppressPackageStartupMessages({
library(GenomicRanges)
library(epiwraps)
library(ggplot2)
library(rGREAT) # Gene Ontology enrichment among genomic regions
library(MotifDb)
library(universalmotif)
library(ggplot2)
library(SummarizedExperiment) # data structure
library(sechm) # for plotting heatmaps from a SummrizedExperiment
library(BiocParallel) # for multithreading
library(chromVAR) # for motif accessibility estimation
library(limma) # for statistical analysis
library(TFBSTools)
library(Biostrings)
})
## See system.file("LICENSE", package="MotifDb") for use restrictions.
# to control multithreading, unix users can use:
register(MulticoreParam(4))
# for windows users, rather one of the following:
# register(SerialParam()) # this will disable multi-threading
# register(SnowParam(2))
system("which gfortran")
system("gfortran --version")
Sys.getenv("PATH")
## [1] "/Library/Frameworks/Python.framework/Versions/3.12/bin:/Library/Frameworks/Python.framework/Versions/3.10/bin:/Library/Frameworks/Python.framework/Versions/3.7/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/opt/X11/bin:/Library/TeX/texbin:/Users/florencemarti/Downloads/sratoolkit.3.2.1-mac-x86_64/bin:/Applications/quarto/bin:/usr/texbin:/Applications/RStudio.app/Contents/Resources/app/quarto/bin:/Applications/RStudio.app/Contents/Resources/app/bin/postback"
#1. Analysis ##1.a Download the data
options(timeout=6000)
download.file("https://ethz-ins.org/content/w10.assignment.zip", "archive.zip", mode="wb")
unzip("archive.zip")
list.files()
peaks <- list.files(pattern="bed$")
# we first import the peaks
peaks <- lapply(peaks, rtracklayer::import.bed)
# we'll focus on the high-quality peaks
peaks <- lapply(peaks, FUN=function(x) x[x$score>800])
# we get the union of non-redundant regions
regions <- reduce(unlist(GRangesList(peaks)))
tracks <- list.files(pattern = "bw$")
creb_bed_files <- grep("creb", peaks, value = TRUE, ignore.case = TRUE)
creb_bw_files <- grep("creb", tracks, value = TRUE, ignore.case = TRUE)
# Create a subset of peaks only for CREB-related BED files
creb_peaks <- peaks[peaks %in% creb_bed_files]
# Create a subset of tracks only for CREB-related BigWig files
creb_tracks <- tracks[tracks %in% creb_bw_files]
ese <- signal2Matrix(creb_tracks, regions, extend=2000)
## Reading Creb1.bw
## Reading Creb3.bw
## Reading Creb3L1.bw
plotEnrichedHeatmaps(ese)
ese2 <- ese[1:1000,]
plotEnrichedHeatmaps(ese2, cluster_rows = TRUE, show_row_dend=TRUE )
Use clustering and visualization to illustrate the relationship between the binding of the different proteins
cl2 <- clusterSignalMatrices(ese, k=2:10)
ggplot(cl2$varExplained, aes(k, varExplained)) + geom_line()
set.seed(123) # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(ese, k=7)
## ~79% of the variance explained by clusters
table(cl)
## cl
## 1 2 3 4 5 6 7
## 405 92 212 495 507 247 311
head(cl)
## [1] 5 4 1 1 1 4
## Levels: 1 2 3 4 5 6 7
length(cl)
## [1] 2269
length(regions)
## [1] 2269
# to make sure the cluster labels stay associated with the corresponding regions/rows
# even if we manipulate the object, put them inside the rowData of the object:
rowData(ese)$cluster <- cl
head(rowData(ese))
## DataFrame with 6 rows and 1 column
## cluster
## <factor>
## chr1:778487-779069 5
## chr1:904436-904985 4
## chr1:941785-942096 1
## chr1:943126-943501 1
## chr1:959088-959463 1
## chr1:961148-961904 4
mycolors <- c("1"="pink", "2"="lightblue", "3"="darkgreen", "4"="orange", "5"="purple", "6"="blue", "7"="lightgreen")
plotEnrichedHeatmaps(ese, row_split="cluster", mean_color=mycolors, colors=c("white","darkred"))
CREB1 and CREB3L1 have similar binding patterns, even though CREB3L1 is
closely related to CREB3.
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line(size=1.2) + facet_wrap(~split)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
cl <- clusterSignalMatrices(ese, k=7, scaleRows = TRUE)
## ~94% of the variance explained by clusters
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line() + facet_wrap(~split)
plotEnrichedHeatmaps(ese, row_split = cl, scale_rows = "global")
Cluster 3
# we first split the regions by cluster:
split_regions <- split(rowRanges(ese), rowData(ese)$cluster)
lengths(split_regions)
## 1 2 3 4 5 6 7
## 405 92 212 495 507 247 311
res <- great(split_regions[["3"]], gene_sets="GO:BP", tss_source="hg38",
background=regions, cores=2)
## * TSS source: TxDb.
## * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.
## * gene ID type in the extended TSS is 'Entrez Gene ID'.
## * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
## chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
## chrY, chrM'.
## * 18110/30830 protein-coding genes left.
## * update seqinfo to the selected chromosomes.
## * TSS extension mode is 'basalPlusExt'.
## * construct the basal domains by extending 5000bp to upstream and 1000bp to downsteram of TSS.
## * calculate distances to neighbour regions.
## * extend to both sides until reaching the neighbour genes or to the maximal extension.
## * use GO:BP ontology with 15413 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 212 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2480 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp <- getEnrichmentTables(res)
head(bp)
## id description genome_fraction
## 1 GO:0046649 lymphocyte activation 0.1141667
## 2 GO:0050896 response to stimulus 0.6333121
## 3 GO:0051716 cellular response to stimulus 0.5674445
## 4 GO:0044237 cellular metabolic process 0.7306017
## 5 GO:0045321 leukocyte activation 0.1249278
## 6 GO:0032501 multicellular organismal process 0.5649184
## observed_region_hits fold_enrichment p_value p_adjust mean_tss_dist
## 1 45 1.859247 3.042332e-05 0.02664705 116862
## 2 160 1.191698 1.088204e-04 0.02664705 112782
## 3 147 1.221963 1.118256e-04 0.02664705 121534
## 4 178 1.149221 1.198092e-04 0.02664705 108666
## 5 46 1.736853 1.259111e-04 0.02664705 116214
## 6 146 1.219077 1.475089e-04 0.02664705 116126
## observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1 24 72 1.819499 0.001248413
## 2 137 642 1.164820 0.004252981
## 3 117 542 1.178310 0.007385788
## 4 143 713 1.094762 0.048972072
## 5 25 82 1.664176 0.004049256
## 6 109 537 1.107963 0.074782394
## p_adjust_hyper
## 1 0.1024531
## 2 0.1545510
## 3 0.1683686
## 4 0.2517716
## 5 0.1545510
## 6 0.2916795
res_cc <- great(split_regions[["3"]], gene_sets="GO:CC", tss_source="hg38",
background=regions, cores=2)
## * TSS source: TxDb.
## * extended_tss is already cached, directly use it.
## * use GO:CC ontology with 1999 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 212 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 381 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
cc <- getEnrichmentTables(res_cc)
head(cc)
## id description genome_fraction
## 1 GO:0071944 cell periphery 0.4400013
## 2 GO:0005886 plasma membrane 0.3999239
## 3 GO:0043226 organelle 0.9016456
## 4 GO:0043231 intracellular membrane-bounded organelle 0.8534896
## 5 GO:0005654 nucleoplasm 0.4206354
## 6 GO:0043229 intracellular organelle 0.8843347
## observed_region_hits fold_enrichment p_value p_adjust mean_tss_dist
## 1 120 1.286446 0.0001521473 0.01671530 123010
## 2 111 1.309211 0.0001803972 0.01671530 113750
## 3 205 1.072463 0.0002639258 0.01671530 114691
## 4 197 1.088760 0.0005360609 0.02546289 114879
## 5 112 1.255961 0.0010174977 0.02919191 101635
## 6 201 1.072120 0.0011372293 0.02919191 114567
## observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1 93 412 1.232137 0.005291146
## 2 87 371 1.280025 0.002119558
## 3 196 1036 1.032689 0.180029741
## 4 175 912 1.047409 0.137611072
## 5 75 342 1.197039 0.029464253
## 6 184 985 1.019659 0.321605073
## p_adjust_hyper
## 1 0.11152344
## 2 0.08054319
## 3 0.52001858
## 4 0.52001858
## 5 0.27862703
## 6 0.61104964
ggplot(head(bp,10), aes(fold_enrichment, reorder(description, p_adjust),
size=observed_region_hits, color=-log10(p_adjust))) +
geom_point() + scale_color_viridis_c()
ggplot(head(cc,10), aes(fold_enrichment, reorder(description, p_adjust),
size=observed_region_hits, color=-log10(p_adjust))) +
geom_point() + scale_color_viridis_c()
The signal profiles in cluster 3 for the three transcription factors show distinct binding dynamics. CREB1 has the strongest enrichment peak, closely followed by CREB3L1. In contrast, CREB3 shows minimal binding activity. This suggests that CREB1 and CREB3L1 may play key roles in regulating the GO terms associated with cluster 3.
Looking at the GO:BP results for cluster 3, several highly enriched terms are related to synaptic vesicle dynamics, including synaptic vesicle recycling, synaptic vesicle endocytosis, and presynaptic endocytosis. More general terms like vesicle localisation also show up, suggesting a role in signalling processes.
Since signalling isn’t something I usually work with, I also checked the GO:CC terms to get a better sense of the cellular context. Among the top enriched terms were extrinsic component of membrane, cell cortex, and cell-substrate junction, along with mitochondrion and endoplasmic reticulum. These components line up well with the biological processes and support the idea that CREB1 and CREB3L1 might be involved in regulating endocytosis and vesicle trafficking in a signalling context.